import csv, argparse, os
import statsmodels.api
import scipy.stats


# generic fail method
def fail(msg):
    """Print ``msg`` and terminate the program with exit status 1.

    Exits through ``sys.exit`` rather than ``os._exit`` so that buffered
    output (including the message printed here) is flushed and normal
    interpreter cleanup runs before the process ends.

    Args:
        msg: The message (or exception object) to show to the user.

    Raises:
        SystemExit: Always, with exit code 1.
    """
    # Local import keeps this block self-contained; the module's top-level
    # imports do not include ``sys``.
    import sys

    print(msg)
    sys.exit(1)


# construct list of year periods
def build_year_list(increment, range_years, periods, yrange_max, yrange_min):
    """Return the sorted list of period start years.

    Args:
        increment: Step between period starts; only used when ``periods`` is
            falsy.
        range_years: Iterable of values convertible with ``int``; only used
            when ``periods`` is truthy, in which case each value becomes one
            entry of the result.
        periods: Truthy to use the explicit values in ``range_years``; falsy
            to generate years from ``yrange_min`` up to (but excluding)
            ``yrange_max`` in steps of ``increment``.
        yrange_max: Exclusive upper bound for the generated years.
        yrange_min: First generated year.

    Returns:
        A new list of ints in ascending order.

    Raises:
        ValueError: If ``increment`` is 0 in fixed-increment mode (raised by
            ``range``), or if an entry of ``range_years`` is not an integer
            string in periods mode.
    """
    if periods:
        year_list = [int(year) for year in range_years]
    else:
        # Let range() decide how many elements there are; a hand-computed
        # size breaks when the span is not an exact multiple of the step.
        year_list = list(range(yrange_min, yrange_max, increment))
    return sorted(year_list)


# simplest dict with numbers as entries
def build_simple_dict_of_nums(year_list):
    """Return a new dict mapping every entry of ``year_list`` to 0.

    Keys appear in the order they are first seen in ``year_list``; repeated
    entries collapse into a single key.
    """
    return dict.fromkeys(year_list, 0)


# helper method to group docs into periods
def determine_year(year, year_list):
    """Return the period start in ``year_list`` that ``year`` belongs to.

    A period runs from one entry of ``year_list`` (inclusive) to the next
    entry (exclusive); the final entry covers every year at or after it.

    Args:
        year: The year to classify.
        year_list: Period start years, expected in ascending order.

    Returns:
        The matching entry of ``year_list``, or ``None`` when ``year_list``
        is empty or ``year`` lies before its first entry.
    """
    if not year_list:
        return None

    # The open-ended last period is checked once up front instead of on
    # every loop iteration.
    if year >= year_list[-1]:
        return year_list[-1]

    # Pair each start with its successor so the lookup never indexes past
    # the end of the list.
    for start, end in zip(year_list, year_list[1:]):
        if start <= year < end:
            return start
    return None


# set up parameters for year range, depending on whether user is
# searching for fixed increments or specific periods of years
def year_parameters(range_years, periods):
    """Derive the increment and year bounds from the user's year values.

    Args:
        range_years: Sequence of values convertible with ``int``. In
            fixed-increment mode it is read as ``[min, max, increment]``; in
            periods mode every entry is a period boundary.
        periods: Truthy to treat ``range_years`` as explicit period
            boundaries instead of a min/max/increment triple.

    Returns:
        The list ``[increment, yrange_min, yrange_max]``. In periods mode
        ``increment`` is 0 and the bounds are the smallest and largest
        boundary. In fixed-increment mode ``yrange_max`` is the requested
        maximum rounded up to a whole number of increments past
        ``yrange_min``, plus one further increment.

    Raises:
        ValueError: If an entry is not an integer string, or if
            ``range_years`` is empty in periods mode.
        IndexError: If fewer than three values are given in fixed-increment
            mode.
        ZeroDivisionError: If the increment is 0 in fixed-increment mode.
    """
    if periods:
        # Take the true extremes instead of the first/last entries so the
        # bounds are right even when the boundaries are given out of order.
        boundaries = [int(value) for value in range_years]
        return [0, min(boundaries), max(boundaries)]

    yrange_min = int(range_years[0])
    requested_max = int(range_years[1])
    increment = int(range_years[2])

    # Pad the upper bound so the trailing partial increment is not cut off,
    # then add one more increment beyond it.
    remainder = (requested_max - yrange_min) % increment
    padding = increment - remainder if remainder else 0
    yrange_max = requested_max + padding + increment
    return [increment, yrange_min, yrange_max]


def build_samples(csv_inpt, year_list, yrange_min, yrange_max):
    """Tally observations and sample sizes per period from a tab-separated file.

    Reads ``csv_inpt + '.tab'`` with a tab delimiter. The first row is the
    header: if its last cell is ``"total words"`` the file is treated as word
    frequencies, otherwise as binary flags.

    For every later row whose first cell is not ``"filename"``, column 1 is
    parsed as the year. Rows with a year in ``[yrange_min, yrange_max)`` are
    assigned to a period via ``determine_year`` and counted:

    * binary mode: the period's sample size grows by 1, and its observation
      count grows by 1 if any cell in ``row[2:-1]`` equals ``"1"``;
    * frequency mode: the sample size grows by ``int(row[-1])`` and the
      observation count by the sum of the integer cells in ``row[2:-1]``.

    Args:
        csv_inpt: Path of the input file without its ``.tab`` extension.
        year_list: Period start years used as the keys of both result dicts.
        yrange_min: Inclusive lower bound on accepted years.
        yrange_max: Exclusive upper bound on accepted years.

    Returns:
        ``[p, n]`` where ``p`` maps each period to its observation count and
        ``n`` maps each period to its sample size.
    """
    # set up observation and sample size dicts
    p = build_simple_dict_of_nums(year_list)
    n = build_simple_dict_of_nums(year_list)
    # newline='' is what the csv module documents for files handed to
    # csv.reader, so that it can do its own line-ending handling.
    with open(csv_inpt + '.tab', 'r', newline='') as csv_file:
        read_csv = csv.reader(csv_file, delimiter='\t')
        header = next(read_csv)
        # the last column is "total words" only when the file stores word
        # frequencies
        binary = header[-1] != "total words"
        print("Building a set of samples")
        for row in read_csv:
            # csv.reader yields [] for blank lines; skip those as well as
            # repeated header rows instead of failing on row[0]
            if not row or row[0] == "filename":
                continue
            year = int(row[1])
            # check to make sure it's within range specified by user
            if not yrange_min <= year < yrange_max:
                continue
            # determine which period it falls within
            target = determine_year(year, year_list)
            if target not in n:
                # no known period for this year, so there is nothing to
                # count (p and n share the same keys)
                continue
            # NOTE(review): the final column is excluded in binary mode too,
            # where the header check says it is not "total words" - confirm
            # that binary files carry a trailing non-keyword column.
            cells = row[2:-1]
            if binary:
                # one more volume to sample size w/r/t year period
                n[target] += 1
                # a volume counts once, however many of its cells are "1"
                if "1" in cells:
                    p[target] += 1
            else:
                # add total words to sample size w/r/t year period
                n[target] += int(row[-1])
                # add every keyword frequency to the observation count
                p[target] += sum(int(cell) for cell in cells)
    return [p, n]


def diff_props_test(k1, n1, k2, n2):
    """Run a two-sided z-test comparing two proportions.

    Delegates to statsmodels' ``proportions_ztest`` with ``[k1, k2]`` as the
    counts and ``[n1, n2]`` as the numbers of observations, using
    ``prop_var=False``.

    Returns:
        ``[z, p_value]`` as produced by ``proportions_ztest``.
    """
    # Documentation:
    # http://statsmodels.sourceforge.net/devel/generated/statsmodels.stats.proportion.proportions_ztest.html
    # Example:
    # http://knowledgetack.com/python/statsmodels/proportions_ztest/
    counts = [k1, k2]
    nobs = [n1, n2]
    ztest = statsmodels.api.stats.proportions_ztest
    z_stat, p_val = ztest(counts, nobs, alternative='two-sided', prop_var=False)
    return [z_stat, p_val]


def main():
    """Command-line entry point: z-test two corpora period by period.

    Parses the command-line options, builds the list of periods, tallies
    observations and sample sizes for both input files, runs a
    difference-of-proportions z-test for each period and writes the results
    to ``<txt>.txt``.

    The ``-y`` value is split on whitespace. Without ``-p`` it is read as
    ``min max increment``; with ``-p`` every value is a period boundary.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-csv1", help="input csv files argument", action="store")
    parser.add_argument("-csv2", help="input csv files argument", action="store")
    parser.add_argument("-txt", help="output text filepath", action="store")
    parser.add_argument("-y", help="min/max for year range and increment value, surround with quotes",
                        action="store")
    parser.add_argument("-p", help="boolean to analyze by different periods rather than a fixed increment value",
                        action="store_true")

    try:
        args = parser.parse_args()
    except IOError as msg:
        fail(msg)

    # Every option below is needed later on; report a missing one with a
    # clear message instead of crashing on None further down.
    if args.txt is None:
        fail("Please enter output text file path.")
    if args.csv1 is None or args.csv2 is None:
        fail("Please enter both input csv file paths (-csv1 and -csv2).")
    if args.y is None:
        fail("Please enter the year range (-y).")

    csv_files = [args.csv1, args.csv2]
    periods = args.p

    range_years = args.y.split()
    if periods:
        if len(range_years) < 2:
            fail("Please enter at least two period boundaries for -y when using -p.")
    elif len(range_years) < 3:
        fail("Please enter min, max and increment values for -y.")

    increment, yrange_min, yrange_max = year_parameters(range_years, periods)
    year_list = build_year_list(increment, range_years, periods, yrange_max, yrange_min)

    x1, n1 = build_samples(csv_files[0], year_list, yrange_min, yrange_max)
    x2, n2 = build_samples(csv_files[1], year_list, yrange_min, yrange_max)

    # two-sided critical z value at the 5% level
    critical = scipy.stats.norm.ppf(1 - (0.05 / 2))

    # calculate z-scores and p values for every period
    diff_props = []
    for year in year_list:
        z, p_val = diff_props_test(x1[year], n1[year], x2[year], n2[year])
        # standard normal CDF at z; written under the "Significance" label
        significance = scipy.stats.norm.cdf(z)
        diff_props.append((z, p_val, significance))

    with open(args.txt + '.txt', 'w') as txt_out:
        # Each section is labelled "start - next start", so the last entry of
        # year_list only closes the preceding period; its own statistics are
        # not written.
        for start, end, (z, p_val, significance) in zip(year_list, year_list[1:], diff_props):
            txt_out.write("Period: {0} - {1}".format(str(start), str(end)) + "\n")
            txt_out.write("Z-score: " + str(z) + "\n")
            txt_out.write("P value: " + str(p_val) + "\n")
            txt_out.write("Significance: " + str(significance) + "\n")
            txt_out.write("Critical: " + str(critical) + "\n\n")

# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == '__main__':
    main()
